######    code imports data from sources and combines into dataset for analysis   #######
###      using code results.R
###     Uses files: import_IPEDS[year].csv for data on Carnegie codes, expenditures (inputs),
###                                                   and enrollment (output)
###                 import_MUP_TotResearch_current.csv    for data on research grants
###                                                           (output)
###                 import_ACUPCC[year].csv    for data on emissions and hdd/cdd
###                                                           (output)
###                 import_key.csv   as a key to match schools from IPEDS data to MUP and ACUPCC data
###
###     Code assumes that files are arranged in the root directory of the project in folders as per the TIER protocol

###   Set the root directory of the project here, eh?
setwd("/Users/michael/Desktop/Working")

###   Year of the IPEDS / ACUPCC import files to combine
year <- 2007

###   NOTE(review): `echo` is not referenced anywhere else in this file;
###   presumably a downstream script reads it -- confirm before removing.
echo <- FALSE

###   macbook
#dat.mat <- read.table("/Users/michaelohara/Desktop/carbon efficiency/USSEEdata/data2805min.csv", header = TRUE, sep = ",")

###   work station
###   These are set as paths relative to the root directory

###   Reads one comma-separated import file from <root>/data/importable.
###   read.table() by default treats both " and ' as quote characters and #
###   as a comment marker.  In a CSV of institution names an apostrophe
###   (e.g. "Saint Mary's College") or a '#' would therefore corrupt the
###   parse, so quoting is restricted to double quotes and comment handling
###   is switched off.
###     file.name   name of the file inside data/importable
###   Returns the file contents as a data.frame with character columns kept
###   as character (no factors).
read.import <- function(file.name) {
  read.table(file.path(getwd(), "data", "importable", file.name),
             header = TRUE,
             sep = ",",
             quote = "\"",
             comment.char = "",
             stringsAsFactors = FALSE)
}

iped.mat   <- read.import(paste0("import_IPEDS", year, ".csv"))
totres.mat <- read.import("import_MUP_TotResearch_current.csv")
pcc.mat    <- read.import(paste0("import_ACUPCC", year, ".csv"))
namekey    <- read.import("import_key.csv")

############################
############################
##  Create dataframe from IPEDS data
##  This will include expenditure data and enrollment
##  As well as school name and identifier code

##  Columns are taken from iped.mat by exact name instead of attach()ing the
##  data frame: with attach(), any global variable that happens to share a
##  column's name silently masks that column, and an error before detach()
##  leaves the data frame on the search path.

iped.required <- c("unitid",
                   "institution.name",
                   "Carnegie.Classification..Basic",
                   "Full.time.equivalent.enrollment",
                   "Instruction.expenses.per.FTE...GASB.",
                   "Instruction.expenses.per.FTE...FASB.",
                   "Research.expenses.per.FTE...GASB.",
                   "Research.expenses.per.FTE..FASB.")
iped.missing <- setdiff(iped.required, names(iped.mat))
if (length(iped.missing) > 0) {
  stop("IPEDS import is missing column(s): ",
       paste(iped.missing, collapse = ", "), call. = FALSE)
}

##  First, create expenditure variables by adding together GASB and FASB versions,
##  And then multiplying by FTE enrollment, since expenditures in data are all per FTE
##  NOTE(review): if the import file leaves the accounting standard a school
##  does not use blank rather than 0, the sum (and the school's expenditure)
##  is NA -- confirm how the import file encodes this.

iped.fte <- iped.mat[["Full.time.equivalent.enrollment"]]

InstrExp <- (iped.mat[["Instruction.expenses.per.FTE...GASB."]] +
               iped.mat[["Instruction.expenses.per.FTE...FASB."]]) * iped.fte

ResExp <- (iped.mat[["Research.expenses.per.FTE...GASB."]] +
             iped.mat[["Research.expenses.per.FTE..FASB."]]) * iped.fte

###  Reclassify Carnegie classifications into simpler groups.
###  Names are the classification labels as they appear in the import file,
###  values are the short group codes.  Any label not listed here (or a
###  missing label) maps to NA and the school is dropped below.
carn.groups <- c(
  "Research Universities (very high research activity)"  = "R1",
  "Research Universities (high research activity)"       = "R2",
  "Doctoral/Research Universities"                       = "R",
  "Baccalaureate Colleges--Arts & Sciences"              = "AS",
  "Masters Colleges and Universities (larger programs)"  = "MastLarge",
  "Masters Colleges and Universities (medium programs)"  = "MastMed",
  "Masters Colleges and Universities (smaller programs)" = "MastSmall"
)
CarnClass <- unname(carn.groups[match(iped.mat[["Carnegie.Classification..Basic"]],
                                      names(carn.groups))])

iped.df <- data.frame(unitid = iped.mat[["unitid"]]
                      ,instnm = iped.mat[["institution.name"]]    ##  identifiers
                      ,CarnClass = CarnClass
                      ,InstrExp = InstrExp
                      ,ResExp = ResExp
                      ,FTE = iped.fte
                      ,stringsAsFactors = FALSE)

##  Keep only schools that fall in one of the groups above, sorted by name
iped.df <- iped.df[!is.na(iped.df$CarnClass), , drop = FALSE]
iped.df <- iped.df[order(iped.df$instnm), ]
###############################
###############################

####################################################################################

###############################
###############################
###   Create dataframe with total research grant data from MUP
###   Year of the MUP research column to use: for 2011 the 2010 column is
###   used, otherwise the column for `year` itself.
res.year <- if (year == 2011) 2010 else year

###   Translate institution names into unitid codes using the key file.
###   A name matches a key row when it equals that row's `name`, `alt1` or
###   `alt2`.  If several key rows match, the LAST one wins.
###   Missing (NA) and empty names never match, so a blank name cannot pick
###   up the unitid of a key row that merely has a blank alias.
###     inst.names   vector of institution names to look up
###     key          data.frame with a `unitid` column and any of the name
###                  columns `name`, `alt1`, `alt2`
###   Returns a character vector the same length as inst.names holding the
###   matched unitid, or NA where no key row matches.
lookup.unitid <- function(inst.names, key) {
  if (!("unitid" %in% names(key))) {
    stop("key file has no 'unitid' column", call. = FALSE)
  }
  inst.names <- as.character(inst.names)
  key.row <- rep(NA_integer_, length(inst.names))
  for (key.col in c("name", "alt1", "alt2")) {
    ## match() reports the first hit; searching the reversed column and
    ## flipping the index back yields the last matching key row instead
    aliases <- rev(as.character(key[[key.col]]))
    hit <- nrow(key) + 1L - match(inst.names, aliases)
    key.row <- pmax(key.row, hit, na.rm = TRUE)
  }
  key.row[is.na(inst.names) | !nzchar(inst.names)] <- NA_integer_
  as.character(key[["unitid"]][key.row])
}

###   Pull the institution name and the total research column for res.year
###   out of the MUP import.  Columns are addressed by exact name rather
###   than through attach()/get(), which would silently pick up a global
###   variable of the same name.
mup.name.col <- "Institutions.Reporting.Any.Federal.Research.in.Past.Five.Years..in.Alphabetical.Order."
mup.res.col <- paste0("X", res.year, ".....Total.Research...x..1000")
mup.missing <- setdiff(c(mup.name.col, mup.res.col), names(totres.mat))
if (length(mup.missing) > 0) {
  stop("MUP import is missing column(s): ",
       paste(mup.missing, collapse = ", "), call. = FALSE)
}

totres.df <- data.frame(instnm = totres.mat[[mup.name.col]]
                      ,TotRes = totres.mat[[mup.res.col]]
                      ,stringsAsFactors = FALSE)

################################
################################
##  Assign unitid codes to schools in MUP data
##  using the key file; schools without a key entry are dropped

tr.unitid <- lookup.unitid(totres.df$instnm, namekey)

totres.df$unitid <- tr.unitid
totres.df <- totres.df[!is.na(tr.unitid), , drop = FALSE]
totres.df <- totres.df[order(totres.df$unitid), ]

###############################
###############################
###   Create dataframe with just totemiss12 from ACUPCC data


##  NOTE(review): as written, both sides of `year == year` would presumably
##  resolve to the same thing inside subset(), so this filter would not
##  select anything -- confirm before re-enabling.
#pcc.mat <- subset(pcc.mat, year == year)

##  Only need the emissions data from this for now

pcc.missing <- setdiff(c("instnm", "totemiss12"), names(pcc.mat))
if (length(pcc.missing) > 0) {
  stop("ACUPCC import is missing column(s): ",
       paste(pcc.missing, collapse = ", "), call. = FALSE)
}

pcc.df <- data.frame(instnm = pcc.mat[["instnm"]] #= schoolname
                      ,totemiss12 = pcc.mat[["totemiss12"]]
                      #,hdd = hdd_noaa
                      #,cdd = cdd_noaa
                      ,stringsAsFactors = FALSE)

################################
################################
##  Assign unitid codes to schools in ACUPCC data
##  using the key file; schools without a key entry are dropped

tr.unitid <- lookup.unitid(pcc.df$instnm, namekey)

pcc.df$unitid <- tr.unitid
pcc.df <- pcc.df[!is.na(tr.unitid), , drop = FALSE]
pcc.df <- pcc.df[order(pcc.df$unitid), ]


################################
################################
##  Merge the IPEDs data with the MUP data
   
##  Join IPEDS to MUP on unitid; only schools present in both are kept
iped.mup.df <- merge(x = iped.df, y = totres.df, by = "unitid")

################################
################################
##  Join the IPEDS + MUP result to the ACUPCC data, again on unitid

iped.mup.pcc.df <- merge(x = iped.mup.df, y = pcc.df, by = "unitid")

###  Clean up final dataframe by removing redundant columns:
###  each source carries its own school name, so drop the MUP copy
###  (instnm.y) and the ACUPCC copy (instnm), then give the IPEDS copy
###  (instnm.x) its plain name back

redundant.cols <- c("instnm.y", "instnm")
is.kept <- !(names(iped.mup.pcc.df) %in% redundant.cols)
final.data <- iped.mup.pcc.df[is.kept]

is.iped.name <- names(final.data) == "instnm.x"
names(final.data)[is.iped.name] <- "instnm"

##  Sort alphabetically by school name so a given school is easy to find
name.order <- order(final.data$instnm)
final.data <- final.data[name.order, ]

###  Write the final table to the final data folder.
###  The path is built from the working directory and assumes the
###  TIER-style directory layout described in the readme file

out.file <- paste0(getwd(), "/data/final/final_data", year, ".csv")
write.table(final.data, out.file, row.names = FALSE, sep = ",")







 
